Let’s say we want to generate numerical values that satisfy the inequalities corresponding to a given profile. This can be done using the code below.
source("compute_profile_means.R")
source("compute_minimum_delta.R")
source("setPowerPointStyle.R")
setPowerPointStyle()
## Warning in par(tmag = 1.2): graphical parameter "tmag" is obsolete
PROFCODES = read.table("profile_codes_v2.txt",header = TRUE,sep = "\t") #contains the definitions of all profiles
load("constraints_vector")
prof_index = 3 #index of profile to be simulated (corresponds to an emergent synergy)
ntimes = 5 #n. of simulations
exp_min = 2 #min range of expression value
exp_max = 16 #max range of expression value
min_delta = 1 #minimum non-zero difference among any two comparisons
profile_means = compute_profile_means(PROFCODES, prof_index, ntimes, exp_min,
exp_max, constraints_vector, min_delta)[ , 1:4]
head(profile_means)
##
## [1,] 3.979122 3.979122 3.979122 6.295081
## [2,] 10.146223 10.146223 10.146223 14.031306
## [3,] 9.992885 9.992885 9.992885 14.449260
## [4,] 8.793068 8.793068 8.793068 14.307622
## [5,] 8.624653 8.624653 8.624653 14.295739
source("setPowerPointStyle.R")
setPowerPointStyle()
colnames(profile_means)=c("0","X","Y","X+Y")
barplot(profile_means[1,], ylab = 'simulated expression')
add = profile_means[1,1] + (profile_means[1,2] - profile_means[1,1]) + (profile_means[1,3] - profile_means[1,1])
abline(h = add, col="red")
After computing the means for a given profile, we can generate random samples resembling real data. We assume that real data come from normal distributions centered around the means computed above. The standard deviation is passed as a parameter, so it is possible to simulate arbitrary noise levels.
source("simulate_from_means.R")
source("setPowerPointStyle.R")
setPowerPointStyle()
samples = 4 #number of samples for each condition that will be simulated
noise_level = 0.5 #this means that the signal-to-noise is delta/noise_level = 1/0.5
design = factor(c(rep("0", samples), rep("X", samples), rep("Y", samples), rep("Y+X", samples)))
simulated_values = simulate_from_means(profile_means[1,], prof_index, samples, noise_level, exp_min, exp_max)
names(simulated_values) = design
boxplot(simulated_values ~ design, ylab = 'simulated expression', col = 'gray')
stripchart(simulated_values ~ design, vertical = TRUE,
method = "jitter", add = TRUE, pch = 20, col = 'black',cex=1.5)
Example with more noise.
source("setPowerPointStyle.R")
setPowerPointStyle()
noise_level = 1 #this means that the signal-to-noise is delta/noise_level = 1/1
simulated_values = simulate_from_means(profile_means[1,], prof_index, samples, noise_level, exp_min, exp_max)
boxplot(simulated_values ~ design, ylab = 'simulated expression', col = 'gray')
stripchart(simulated_values ~ design, vertical = TRUE,
method = "jitter", add = TRUE, pch = 20, col = 'black',cex=1.5)
The data file should have the first two columns with annotation (for example probe ID and gene Symbol). Numerical data start from column 3.
data_file = "TNF_IFN_2.csv"
my_data = read.csv(data_file,sep = '\t')
head(my_data)
## probe gene CTRL CTRL.1 CTRL.2 IFN.2.5 IFN.2.5.1 IFN.2.5.2
## 1 RFC2 RFC2 8.324769 8.324769 8.132764 8.375789 8.373458 8.324769
## 2 HSPA6 HSPA6 5.413300 4.830096 5.929786 8.329249 8.006876 7.950832
## 3 PAX8 PAX8 5.594199 5.581641 5.581641 5.661038 5.581641 5.581641
## 4 UBA7 UBA7 7.361664 7.307654 7.336399 7.336399 7.217897 7.197828
## 5 CCL5 CCL5 9.985708 8.999638 10.371103 4.692704 4.517400 4.996733
## 6 ESRRA ESRRA 6.335024 6.373706 6.539027 6.630504 6.847058 6.778308
## TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1 TNF.IFN.2.5.2
## 1 8.324769 8.462647 8.386247 8.324769 8.338911 8.192397
## 2 6.248139 6.207699 5.940508 7.677756 7.484339 7.540568
## 3 5.581641 5.581641 5.581641 5.504055 5.411538 5.545741
## 4 7.671212 7.607462 7.470790 7.420842 7.336399 7.095814
## 5 12.476331 12.450789 12.326590 6.663396 5.642317 6.052862
## 6 6.155354 6.370346 6.397341 6.975653 6.886920 6.539027
#get numeric data
expression_data = my_data[,-(1:2)]
if (max(expression_data)>25) expression_data = log2(expression_data)
#removing uninformative probes (very small coefficient of variation)
cof_cutoff = 0.05
cof = apply(expression_data, 1, function(x) sd(x)/mean(x))
cof_filter = which(cof > cof_cutoff)
#graphical parameters
source("setPowerPointStyle.R")
setPowerPointStyle()
my.pca <- prcomp(t(expression_data[cof_filter, ]), center = TRUE, scale=TRUE)
#we assume the same number of samples for each condition
samples = ncol(expression_data)/4
cols = c(rep("black", samples), rep("red", samples),
rep("blue", samples), rep("yellow", samples))
plot(my.pca$x[, 1], my.pca$x[, 2], col = cols,
xlab = "PC1", ylab = "PC2", pch = 20, cex = 1.5, main = data_file)
legend("bottomleft", pch = 20, col = unique(cols),
legend = c("0","X","Y","X+Y"), bty = 'n',cex = 1)
source("filter_data_on_deltas.R")
design = factor(c(rep("0",samples),rep("X",samples),
rep("Y",samples),rep("Y+X",samples)))
my_data_filtered = filter_data_on_deltas(my_data, design = design)
head(my_data_filtered)
## probe gene CTRL CTRL.1 CTRL.2 IFN.2.5 IFN.2.5.1
## 2 HSPA6 HSPA6 5.413300 4.830096 5.929786 8.329249 8.006876
## 5 CCL5 CCL5 9.985708 8.999638 10.371103 4.692704 4.517400
## 10 PXK PXK 9.294279 9.508841 9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737 9.831871 7.027361 7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517 6.384521 7.951326 7.867392
## 14 PIGX PIGX 9.020155 8.955406 9.036642 8.261126 8.799131
## IFN.2.5.2 TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2 7.950832 6.248139 6.207699 5.940508 7.677756 7.484339
## 5 4.996733 12.476331 12.450789 12.326590 6.663396 5.642317
## 10 10.093442 8.939042 9.193210 8.997016 9.972881 10.101698
## 11 7.320024 8.754430 8.966617 8.826732 7.349552 7.153825
## 12 7.562287 6.122274 6.428453 6.443509 8.016323 7.685079
## 14 8.650005 8.832036 8.905883 8.848334 9.028034 9.129482
## TNF.IFN.2.5.2
## 2 7.540568
## 5 6.052862
## 10 10.142173
## 11 7.248851
## 12 7.787337
## 14 8.781471
source("find_optimal_match.R")
my_data_filtered_matched = find_optimal_match(my_data_filtered)
head(my_data_filtered_matched)
## probe gene CTRL CTRL.1 CTRL.2 IFN.2.5 IFN.2.5.1
## 2 HSPA6 HSPA6 5.413300 4.830096 5.929786 8.329249 8.006876
## 5 CCL5 CCL5 9.985708 8.999638 10.371103 4.692704 4.517400
## 10 PXK PXK 9.294279 9.508841 9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737 9.831871 7.027361 7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517 6.384521 7.951326 7.867392
## 14 PIGX PIGX 9.020155 8.955406 9.036642 8.261126 8.799131
## IFN.2.5.2 TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2 7.950832 6.248139 6.207699 5.940508 7.677756 7.484339
## 5 4.996733 12.476331 12.450789 12.326590 6.663396 5.642317
## 10 10.093442 8.939042 9.193210 8.997016 9.972881 10.101698
## 11 7.320024 8.754430 8.966617 8.826732 7.349552 7.153825
## 12 7.562287 6.122274 6.428453 6.443509 8.016323 7.685079
## 14 8.650005 8.832036 8.905883 8.848334 9.028034 9.129482
## TNF.IFN.2.5.2 score prof_index
## 2 7.540568 0.290 52
## 5 6.052862 0.536 108
## 10 10.142173 0.426 86
## 11 7.248851 0.500 68
## 12 7.787337 0.660 21
## 14 8.781471 0.598 2
source("visualize_all_profiles.R")
source("setPowerPointStyle.R")
setPowerPointStyle()
visualize_all_profiles(my_data_filtered_matched)
source("generate_all_profiles.R")
generate_all_profiles()